In [ ]:
%matplotlib inline

import time
import calendar
import codecs
import datetime
import json
import sys
import gzip
import string
import glob
import requests
import os

import numpy as np

Twitter Crisis Analytics

The following notebook walks through a number of common pieces of functionality one may want when analyzing Twitter data following a crisis. We will start by defining information for a set of events for which we have data.


In [ ]:
crisisInfo = {
    "boston": {
        "name": "Boston Marathon Bombing",
        "time": 1366051740, # Timestamp in seconds since 1/1/1970, UTC
                            # 15 April 2013, 14:49 EDT -> 18:49 UTC
        "directory": "boston",
        "keywords": ["boston", "exploision", "bomb", "marathon"],
        "box": { # Bounding box for geographic limits
            "lowerLeftLon": -124.848974,
            "lowerLeftLat": 24.396308,
            "upperRightLon": -66.885444,
            "upperRightLat": 49.384358,
        }
    },
    
    "paris_hebdo": {
        "name": "Charlie Hebdo Attack",
        "time": 1420626600, # Timestamp in seconds since 1/1/1970, UTC
                            # 7 January 2015, 11:30 CET -> 10:30 UTC
        "directory": "paris_hebdo",
        "keywords": ["paris", "hebdo"],
        "box": {
            "lowerLeftLon": -5.1406,
            "lowerLeftLat": 41.33374,
            "upperRightLon": 9.55932,
            "upperRightLat": 51.089062,
        }
    },
    
    "nepal": {
        "name": "Nepal Earthquake",
        "time": 1429942286, # Timestamp in seconds since 1/1/1970, UTC
                            # 25 April 2015, 6:11:26 UTC
        "directory": "nepal",
        "keywords": ["nepal", "earthquake", "quake", "nsgs"],
        "box": {
            "lowerLeftLon": 80.0562,
            "lowerLeftLat": 26.3565,
            "upperRightLon": 88.1993,
            "upperRightLat": 30.4330,
        }
    },
    
    "paris_nov": {
        "name": "Paris November Attacks",
        "time": 1447446000, # Timestamp in seconds since 1/1/1970, UTC
                            # 13 November 2015, 20:20 UTC to 23:58 UTC
        "directory": "paris_nov",
        "keywords": ["paris", "shots", "explosion"],
        "box": {
            "lowerLeftLon": -5.1406,
            "lowerLeftLat": 41.33374,
            "upperRightLon": 9.55932,
            "upperRightLat": 51.089062,
        }
    },
    
    "brussels": {
        "name": "Brussels Transit Attacks",
        "time": 1458629880, # Timestamp in seconds since 1/1/1970, UTC
                            # 22 March 2016, 6:58 UTC to 08:11 UTC
        "directory": "brussels",
        "keywords": ["brussels", "bomb", "belgium", "explosion"],
        "box": {
            "lowerLeftLon": 2.54563,
            "lowerLeftLat": 49.496899,
            "upperRightLon": 6.40791,
            "upperRightLat": 51.5050810,
        }
    },
}

Choose Your Crisis

Since we have several disasters we can look at and don't have time to explore them all, you can pick one and follow along with our analysis on the crisis that interests you.

To select the crisis you want, pick from the list printed below.


In [ ]:
print ("Available Crisis Names:")
for k in sorted(crisisInfo.keys()):
    print ("\t", k)

In [ ]:
# Replace the name below with your selected crisis
selectedCrisis = "nepal"

Topic 3.1: Reading Tweets

The first thing we do is read in tweets from a directory of compressed files. Our collection of compressed tweets is in the 00_data directory, so we'll use pattern matching (called "globbing") to find all the tweet files in the given directory.

Then, for each file, we'll open it, read each line (which is a tweet in JSON form), and build an object out of it. As part of this process, we will extract each tweet's post time and create a map from minute timestamps to the tweets posted during that minute.


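Before reading the data, it helps to know the shape of the objects we'll be working with. Below is a minimal sketch of the tweet fields this notebook relies on; the values are purely illustrative, not real data.


In [ ]:
# Illustrative (not real) tweet object showing the fields used throughout this notebook
exampleTweet = {
    "created_at": "Sat Apr 25 06:15:00 +0000 2015",   # parsed with timeFormat below
    "id_str": "591234567890123456",                   # hypothetical ID
    "text": "Just felt a strong earthquake near Kathmandu #earthquake",
    "lang": "en",
    "retweet_count": 0,
    "favorite_count": 0,
    "source": "<a href=\"...\">Twitter Web Client</a>",
    "user": {"screen_name": "example_user", "followers_count": 42, "verified": False},
    "entities": {
        "hashtags": [{"text": "earthquake"}],
        "urls": [{"url": "https://t.co/xxxxxxxxxx"}],
        "user_mentions": [{"screen_name": "another_user"}],
        # "media": [{"media_url": "..."}],            # only present when media is attached
    },
    "coordinates": {"type": "Point", "coordinates": [85.32, 27.71]},  # [longitude, latitude] or None
    "place": None,
    # "retweeted_status": {...},                      # only present for retweets
}
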
In [ ]:
# Determine host-specific location of data
tweetDirectory = crisisInfo[selectedCrisis]["directory"]
tweetGlobPath = os.path.join("..", "00_data", tweetDirectory, "statuses.log.*.gz")

print ("Reading files from:", tweetGlobPath)

# Dictionary for mapping dates to data
frequencyMap = {}

# For counting tweets
globalTweetCounter = 0

# Twitter's time format, for parsing the created_at date
timeFormat = "%a %b %d %H:%M:%S +0000 %Y"

for tweetFilePath in glob.glob(tweetGlobPath):
    print ("Reading File:", tweetFilePath)

    for line in gzip.open(tweetFilePath, 'rb'):

        # Try to read the tweet JSON into an object
        tweetObj = None
        try:
            tweetObj = json.loads(line.decode("utf-8"))
        except Exception:
            continue

        # Skip deleted and withheld status messages
        if ( "delete" in tweetObj.keys() or "status_withheld" in tweetObj.keys() ):
            continue

        # Try to extract the time of the tweet
        try:
            currentTime = datetime.datetime.strptime(tweetObj['created_at'], timeFormat)
        except:
            print (line)
            raise

        currentTime = currentTime.replace(second=0)

        # Increment tweet count
        globalTweetCounter += 1

        # If our frequency map already has this time, use it, otherwise add
        if ( currentTime in frequencyMap.keys() ):
            timeMap = frequencyMap[currentTime]
            timeMap["count"] += 1
            timeMap["list"].append(tweetObj)
        else:
            frequencyMap[currentTime] = {"count":1, "list":[tweetObj]}

# Fill in any gaps
times = sorted(frequencyMap.keys())
firstTime = times[0]
lastTime = times[-1]
thisTime = firstTime

# We want to look at per-minute data, so we fill in any missing minutes
timeIntervalStep = datetime.timedelta(seconds=60)    # One-minute time step
while ( thisTime <= lastTime ):
    if ( thisTime not in frequencyMap.keys() ):
        frequencyMap[thisTime] = {"count":0, "list":[]}
        
    thisTime = thisTime + timeIntervalStep

print ("Processed Tweet Count:", globalTweetCounter)


Topic 4: Simple Frequency Analysis

In this section, we will cover a few simple analysis techniques to garner some small insights rapidly.

  • Frequency Graph
  • Top users
  • Top hash tags
  • Top URLs
  • Top images
  • Most retweeted tweet
  • Keyword Frequency

Twitter Timeline

To build a timeline of Twitter usage, we can simply plot the number of tweets posted per minute.


In [ ]:
import matplotlib.pyplot as plt

crisisMoment = crisisInfo[selectedCrisis]["time"]
crisisTime = datetime.datetime.utcfromtimestamp(crisisMoment)
crisisTime = crisisTime.replace(second=0)
print ("Crisis Time:", crisisTime)

fig, ax = plt.subplots()
fig.set_size_inches(11, 8.5)

plt.title("Tweet Frequency")

# Sort the times into an array for future use
sortedTimes = sorted(frequencyMap.keys())

# What time span do these tweets cover?
print ("Time Frame:", sortedTimes[0], sortedTimes[-1])

# Get a count of tweets per minute
postFreqList = [frequencyMap[x]["count"] for x in sortedTimes]

# We'll only put ticks every ten minutes (more would clutter the graph)
smallerXTicks = range(0, len(sortedTimes), 10)
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)

# Plot the post frequency
yData = [x if x > 0 else 0 for x in postFreqList]
ax.plot(range(len(frequencyMap)), yData, color="blue", label="Posts")

crisisXCoord = sortedTimes.index(crisisTime)
ax.scatter([crisisXCoord], [np.mean(yData)], c="r", marker="x", s=100, label="Crisis")

ax.grid(b=True, which=u'major')
ax.legend()

plt.show()

Top Twitter Users

Finding good sources of information is really important during crises, but on Twitter the loudest or most prolific users are not necessarily good sources. We first check who these prolific users are by determining who was tweeting the most during this time span.


In [ ]:
# Create maps for holding counts and tweets for each user
globalUserCounter = {}
globalUserMap = {}

# Iterate through the time stamps
for t in sortedTimes:
    timeObj = frequencyMap[t]
    
    # For each tweet, pull the screen name and add it to the list
    for tweet in timeObj["list"]:
        user = tweet["user"]["screen_name"]
        
        if ( user not in globalUserCounter ):
            globalUserCounter[user] = 1
            globalUserMap[user] = [tweet]
        else:
            globalUserCounter[user] += 1
            globalUserMap[user].append(tweet)

print ("Unique Users:", len(globalUserCounter.keys()))

In [ ]:
sortedUsers = sorted(globalUserCounter, key=globalUserCounter.get, reverse=True)
print ("Top Ten Most Prolific Users:")
for u in sortedUsers[:10]:
    print (u, globalUserCounter[u], 
           "\n\t", "Example Tweet:", globalUserMap[u][0]["text"], "\n----------")

Many of these tweets are not relevant to the event at hand. Twitter is a very noisy place.

Hashtags, however, are high signal keywords. Maybe the most common hashtags will be more informative.


In [ ]:
# A map for hashtag counts
hashtagCounter = {}

# For each minute, pull the list of hashtags and add to the counter
for t in sortedTimes:
    timeObj = frequencyMap[t]
    
    for tweet in timeObj["list"]:
        hashtagList = tweet["entities"]["hashtags"]
        
        for hashtagObj in hashtagList:
            
            # We lowercase the hashtag to avoid duplicates (e.g., #MikeBrown vs. #mikebrown)
            hashtagString = hashtagObj["text"].lower()
            
            if ( hashtagString not in hashtagCounter ):
                hashtagCounter[hashtagString] = 1
            else:
                hashtagCounter[hashtagString] += 1

print ("Unique Hashtags:", len(hashtagCounter.keys()))
sortedHashtags = sorted(hashtagCounter, key=hashtagCounter.get, reverse=True)
print ("Top Twenty Hashtags:")
for ht in sortedHashtags[:20]:
    print ("\t", "#" + ht, hashtagCounter[ht])

We can do the same with URLs to find the most shared URL.


In [ ]:
# A map for URL counts
urlCounter = {}

# For each minute, pull the list of URLs and add to the counter
for t in sortedTimes:
    timeObj = frequencyMap[t]
    
    for tweet in timeObj["list"]:
        urlList = tweet["entities"]["urls"]
        
        for url in urlList:
            urlStr = url["url"]
            
            if ( urlStr not in urlCounter ):
                urlCounter[urlStr] = 1
            else:
                urlCounter[urlStr] += 1

print ("Unique URLs:", len(urlCounter.keys()))
sortedUrls = sorted(urlCounter, key=urlCounter.get, reverse=True)
print ("Top Twenty URLs:")
for url in sortedUrls[:20]:
    print ("\t", url, urlCounter[url])

Note how each URL is shortened using Twitter's shortener. To get a better idea of the content, we should expand each URL to its true destination.


In [ ]:
print ("Top Expanded URLs:")
for url in sortedUrls[:10]:
    try:
        r = requests.get(url)
        realUrl = r.url
        print ("\t", url, urlCounter[url], "->", realUrl)
    except:
        print ("\t", url, urlCounter[url], "->", "UNKNOWN Failure")

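A lighter-weight alternative (a sketch, not what we used above) is to issue a HEAD request with allow_redirects=True, which resolves the final URL without downloading the page body. Note that some servers handle HEAD requests poorly, so falling back to GET may be necessary.


In [ ]:
# Sketch: resolve shortened URLs without fetching the full page body
print ("Top Expanded URLs (HEAD requests):")
for url in sortedUrls[:10]:
    try:
        r = requests.head(url, allow_redirects=True, timeout=5)
        print ("\t", url, urlCounter[url], "->", r.url)
    except Exception:
        print ("\t", url, urlCounter[url], "->", "UNKNOWN Failure")
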
Since URLs and Hashtags are both entities, we can do the same for other entities, like mentions and media.


In [ ]:
# A map for mention counts
mentionCounter = {}

# For each minute, pull the list of mentions and add to the counter
for t in sortedTimes:
    timeObj = frequencyMap[t]
    
    for tweet in timeObj["list"]:
        mentions = tweet["entities"]["user_mentions"]
        
        for mention in mentions:
            mentionStr = mention["screen_name"]
            
            if ( mentionStr not in mentionCounter ):
                mentionCounter[mentionStr] = 1
            else:
                mentionCounter[mentionStr] += 1

print ("Unique Mentions:", len(mentionCounter.keys()))
sortedMentions = sorted(mentionCounter, key=mentionCounter.get, reverse=True)
print ("Top Twenty Mentions:")
for mention in sortedMentions[:20]:
    print ("\t", mention, mentionCounter[mention])

In [ ]:
# A map for media counts
mediaCounter = {}

# For each minute, pull the list of media and add to the counter
for t in sortedTimes:
    timeObj = frequencyMap[t]
    
    for tweet in timeObj["list"]:
        if ( "media" not in tweet["entities"] ):
            continue
            
        mediaList = tweet["entities"]["media"]
        
        for media in mediaList:
            mediaStr = media["media_url"]
            
            if ( mediaStr not in mediaCounter ):
                mediaCounter[mediaStr] = 1
            else:
                mediaCounter[mediaStr] += 1

print ("Unique Media:", len(mediaCounter.keys()))
sortedMedia = sorted(mediaCounter, key=mediaCounter.get, reverse=True)
print ("Top Twenty Media:")
for media in sortedMedia[:20]:
    print ("\t", media, mediaCounter[media])

We can see that some of this data is relevant, in the pictures as well as the hashtags and URLs. Are the most retweeted tweets also useful? Or are they expressing condolences? Or are they completely unrelated?


In [ ]:
# A map from tweet ID to retweet count
tweetRetweetCountMap = {}
rtList = []

# For each minute, record each tweet's retweet count (using the original tweet for retweets)
for t in sortedTimes:
    timeObj = frequencyMap[t]
    
    for tweet in timeObj["list"]:
        tweetId = tweet["id_str"]
        rtCount = tweet["retweet_count"]
        
        if ( "retweeted_status" in tweet ):
            tweetId = tweet["retweeted_status"]["id_str"]
            rtCount = tweet["retweeted_status"]["retweet_count"]
            
        tweetRetweetCountMap[tweetId] = rtCount
        rtList.append(rtCount)
        
sortedRetweets = sorted(tweetRetweetCountMap, key=tweetRetweetCountMap.get, reverse=True)
print ("Top Ten Retweets:")
for tweetId in sortedRetweets[:10]:
    thisTweet = None
    
    for t in reversed(sortedTimes):
        for tweet in frequencyMap[t]["list"]:
            if ( tweet["id_str"] == tweetId ):
                thisTweet = tweet
                break
                
            if ( "retweeted_status" in tweet and tweet["retweeted_status"]["id_str"] == tweetId ):
                thisTweet = tweet["retweeted_status"]
                break
                
        if ( thisTweet is not None ):
            break
    
    print ("\t", tweetId, tweetRetweetCountMap[tweetId], thisTweet["text"])

The top retweets are dominated by popular tweets that predate the crisis. To correct for this, we should remove retweets of tweets that are older than the event.


In [ ]:
print ("Top Ten RECENT Retweets:")

foundTweets = 0
for tweetId in sortedRetweets:
    thisTweet = None
    
    # Find the most recent copy of the tweet
    for t in reversed(sortedTimes):
        for tweet in frequencyMap[t]["list"]:
            if ( tweet["id_str"] == tweetId ):
                thisTweet = tweet
                break
                
            if ( "retweeted_status" in tweet and tweet["retweeted_status"]["id_str"] == tweetId ):
                thisTweet = tweet["retweeted_status"]
                break
                
        if ( thisTweet is not None ):
            break
    
    createdTime = datetime.datetime.strptime(thisTweet['created_at'], timeFormat)
    
    # If tweet creation time is before the crisis, assume irrelevant
    if ( createdTime < crisisTime ):
        continue
        
    print ("\t", tweetId, tweetRetweetCountMap[tweetId], thisTweet["text"])
    
    foundTweets += 1
    
    if ( foundTweets >= 10 ):
        break

Event Detection w/ Keyword Frequency

Twitter is good for breaking news. When an impactful event occurs, we often see a spike in the usage of related keywords on Twitter. Some examples are below.


In [ ]:
# What keywords are we interested in?
targetKeywords = crisisInfo[selectedCrisis]["keywords"]

# Build an empty map for each keyword we are searching for
targetCounts = {x:[] for x in targetKeywords}
totalCount = []

# For each minute, pull the tweet text and search for the keywords we want
for t in sortedTimes:
    timeObj = frequencyMap[t]
    
    # Temporary counter for this minute
    localTargetCounts = {x:0 for x in targetKeywords}
    localTotalCount = 0
    
    for tweetObj in timeObj["list"]:
        tweetString = tweetObj["text"].lower()

        localTotalCount += 1
        
        # Add to the counter if the target keyword is in this tweet
        for keyword in targetKeywords:
            if ( keyword in tweetString ):
                localTargetCounts[keyword] += 1
                
    # Add the counts for this minute to the main counter
    totalCount.append(localTotalCount)
    for keyword in targetKeywords:
        targetCounts[keyword].append(localTargetCounts[keyword])
        
# Now plot the total frequency and frequency of each keyword
fig, ax = plt.subplots()
fig.set_size_inches(11, 8.5)

plt.title("Tweet Frequency")
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)

ax.semilogy(range(len(frequencyMap)), totalCount, label="Total")

ax.scatter([crisisXCoord], [100], c="r", marker="x", s=100, label="Crisis")

for keyword in targetKeywords:
    ax.semilogy(range(len(frequencyMap)), targetCounts[keyword], label=keyword)
ax.legend()
ax.grid(b=True, which=u'major')

plt.show()


Time for a break!


Topic 5: Geographic Data

Data in social media can be relevant to an event in three ways: temporally relevant, geographically relevant, or topically relevant. So far, we've looked at temporally relevant data, or data that was posted at about the same time as the target event. Now we'll explore geographically relevant data, or data posted near the event.

Twitter allows users to share their GPS locations when tweeting, but only about 2% of tweets have this information. We can extract this geospatial data to look at patterns in different locations.

  • General plotting
  • Filtering by a bounding box
  • Images from target location

Plotting GPS Data

Each tweet has a field called "coordinates" describing where the tweet was posted from. The field is null if the tweet contains no location data; otherwise it holds GPS coordinates in (longitude, latitude) order. (Coarser place and bounding-box information lives in the separate "place" field.) We want tweets with this GPS data.

For more information on tweet JSON formats, check out https://dev.twitter.com/overview/api/tweets


In [ ]:
# A frequency map for timestamps to geo-coded tweets
geoFrequencyMap = {}
geoCount = 0

# Keep only those tweets with a tweet['coordinates']['coordinates'] entry
for t in sortedTimes:
    geos = list(filter(lambda tweet: tweet["coordinates"] != None and 
                       "coordinates" in tweet["coordinates"], 
                       frequencyMap[t]["list"]))
    geoCount += len(geos)
    
    # Add to the timestamp map
    geoFrequencyMap[t] = {"count": len(geos), "list": geos}

print ("Number of Geo Tweets:", geoCount)

GPS Frequency

What is the frequency of GPS-coded tweets?


In [ ]:
fig, ax = plt.subplots()
fig.set_size_inches(11, 8.5)

plt.title("Geo Tweet Frequency")

gpsFreqList = [geoFrequencyMap[x]["count"] for x in sortedTimes]
postFreqList = [frequencyMap[x]["count"] for x in sortedTimes]

plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)

xData = range(len(geoFrequencyMap))
gpsYData = [x if x > 0 else 0 for x in gpsFreqList]
freqYData = [x if x > 0 else 0 for x in postFreqList]

ax.semilogy(xData, freqYData, color="blue", label="Posts")
ax.semilogy(xData, gpsYData, color="green", label="GPS Posts")
ax.scatter([crisisXCoord], [100], c="r", marker="x", s=100, label="Crisis")

ax.grid(b=True, which=u'major')
ax.legend()

plt.show()

Plotting GPS Data

Now that we have a list of all the tweets with GPS coordinates, we can plot from where in the world these tweets were posted. To make this plot, we can leverage the Basemap package to make a map of the world and convert GPS coordinates to (x, y) coordinates we can then plot.


In [ ]:
import matplotlib
import functools

from mpl_toolkits.basemap import Basemap

# Create a list of all geo-coded tweets
tmpGeoList = [geoFrequencyMap[t]["list"] for t in sortedTimes]
geoTweets = functools.reduce(lambda x, y: x + y, tmpGeoList)

# For each geo-coded tweet, extract its GPS coordinates
geoCoord = [x["coordinates"]["coordinates"] for x in geoTweets]

# Now we build a map of the world using Basemap
land_color = 'lightgray'
water_color = 'lightblue'

fig, ax = plt.subplots(figsize=(24,24))
worldMap = Basemap(projection='merc', llcrnrlat=-80, urcrnrlat=80,
                   llcrnrlon=-180, urcrnrlon=180, resolution='l')

worldMap.fillcontinents(color=land_color, lake_color=water_color, zorder=1)
worldMap.drawcoastlines()
worldMap.drawparallels(np.arange(-90.,120.,30.))
worldMap.drawmeridians(np.arange(0.,420.,60.))
worldMap.drawmapboundary(fill_color=water_color, zorder=0)
ax.set_title('World Tweets')

# Convert points from GPS coordinates to (x,y) coordinates
convPoints = [worldMap(p[0], p[1]) for p in geoCoord]
x = [p[0] for p in convPoints]
y = [p[1] for p in convPoints]
worldMap.scatter(x, y, s=100, marker='x', color="red", zorder=2)

plt.show()

Filtering By Location

We can use existing Geographic Information System (GIS) tools to determine from where a tweet was posted. For example, we could ask whether a particular tweet was posted from the United States. This filtering is often performed using shape files. For our purposes though, we established a bounding box along with the crisis data, so we'll use that as our filter for simplicity.


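Because our bounding box is axis-aligned in longitude/latitude space, a plain comparison against the raw coordinates would also work. The sketch below shows that alternative; the cell after it uses the Basemap/Polygon approach instead so the filtered points can be plotted directly on the projected map.


In [ ]:
# Sketch: point-in-box test in raw (longitude, latitude) space, no map projection required
def inBoundingBox(lon, lat, box):
    return (box["lowerLeftLon"] <= lon <= box["upperRightLon"] and
            box["lowerLeftLat"] <= lat <= box["upperRightLat"])

# Hypothetical usage: inBoundingBox(85.32, 27.71, crisisInfo["nepal"]["box"]) would return True
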
In [ ]:
# Get the bounding box for our crisis
bBox = crisisInfo[selectedCrisis]["box"]

fig, ax = plt.subplots(figsize=(11,8.5))

# Create a new map to hold the shape file data
targetMap = Basemap(llcrnrlon=bBox["lowerLeftLon"], 
                    llcrnrlat=bBox["lowerLeftLat"], 
                    urcrnrlon=bBox["upperRightLon"], 
                    urcrnrlat=bBox["upperRightLat"], 
                    projection='merc',
                    resolution='i', area_thresh=10000)

targetMap.fillcontinents(color=land_color, lake_color=water_color, 
                         zorder=1)
targetMap.drawcoastlines()
targetMap.drawparallels(np.arange(-90.,120.,30.))
targetMap.drawmeridians(np.arange(0.,420.,60.))
targetMap.drawmapboundary(fill_color=water_color, zorder=0)
targetMap.drawcountries()

# Now we build the polygon for filtering
# Convert from lon, lat of lower-left to x,y coordinates
llcCoord = targetMap(bBox["lowerLeftLon"], bBox["lowerLeftLat"])

# Same for upper-right corner
urcCoord = targetMap(bBox["upperRightLon"], bBox["upperRightLat"])

# Now make the polygon we'll use for filtering
boxPoints = np.array([[llcCoord[0], llcCoord[1]], 
                      [llcCoord[0], urcCoord[1]], 
                      [urcCoord[0], urcCoord[1]], 
                      [urcCoord[0], llcCoord[1]]])
boundingBox = matplotlib.patches.Polygon(boxPoints)

# Map of timestamps to tweets inside the target bounding box
inTargetFreqMap = {}
plottablePointsX = []
plottablePointsY = []

# For each geo-coded tweet, extract coordinates and convert 
# them to the Basemap space
for t in sortedTimes:
    geos = geoFrequencyMap[t]["list"]
    convPoints = [(targetMap(tw["coordinates"]["coordinates"][0], tw["coordinates"]["coordinates"][1]), tw) for tw in geos]

    # Local counters for this time
    inTargetFreqMap[t] = {"count": 0, "list": []}
    
    # For each point, check if it is within the bounding box or not
    for point in convPoints:
        x = point[0][0]
        y = point[0][1]

        if ( boundingBox.contains_point((x, y))):
            inTargetFreqMap[t]["list"].append(point[1])
            plottablePointsX.append(x)
            plottablePointsY.append(y)

# Plot points in our target
targetMap.scatter(plottablePointsX, plottablePointsY, s=100, marker='x', color="red", zorder=2)
            
# Count the number of tweets that fall in the area
targetTweetCount = np.sum([len(inTargetFreqMap[t]["list"]) for t in sortedTimes])
            
print ("Tweets in Target Area:", targetTweetCount)
print ("Tweets outside:", (geoCount - targetTweetCount))

plt.show()

Geographically Relevant Tweet Content

Now that we have a list of tweets from the target area, what are they saying?


In [ ]:
# Merge our list of relevant tweets
geoRelevantTweets = [tw for x in sortedTimes for tw in inTargetFreqMap[x]["list"]]

print("Time of Crisis:", crisisTime)

# Print the first few tweets
for tweet in geoRelevantTweets[:10]:
    print("Tweet By:", tweet["user"]["screen_name"])
    print("\t", "Tweet Text:", tweet["text"])
    print("\t", "Tweet Time:", tweet["created_at"])
    print("\t", "Source:", tweet["source"])
    print("\t", "Retweets:", tweet["retweet_count"])
    print("\t", "Favorited:", tweet["favorite_count"])
    print("\t", "Twitter's Guessed Language:", tweet["lang"])
    if ( "place" in tweet ):
        print("\t", "Tweet Location:", tweet["place"]["full_name"])
    print("-----")

Media from Within Target

With this filtered list of tweets, we can extract media posted from the event.


In [ ]:
from IPython.display import display
from IPython.display import Image

geoTweetsWithMedia = list(filter(lambda tweet: "media" in tweet["entities"], geoRelevantTweets))
print ("Tweets with Media:", len(geoTweetsWithMedia))

if ( len(geoTweetsWithMedia) == 0 ):
    print ("Sorry, not tweets with media...")

for tweet in geoTweetsWithMedia:
    imgUrl = tweet["entities"]["media"][0]["media_url"]
    print (tweet["text"])
    display(Image(url=imgUrl))

Topic 6: Content and Sentiment Analysis

Another popular type of analysis people do on social networks is "sentiment analysis," which is used to figure out how people feel about a specific topic. Some tools also provide measurements like subjectivity/objectivity of text content.

We'll cover:

  • Topically Relevant Filtering
  • Sentiment, Subjectivity, and Objectivity

Topically Relevant Tweets

As we've seen, Twitter has a lot of noise and irrelevant data, so we should clean the data a bit before analyzing sentiment. To do so, we'll filter our data down to only those tweets containing relevant keywords.


In [ ]:
# What keywords are we interested in?
targetKeywords = crisisInfo[selectedCrisis]["keywords"]

# Map for storing topically relevant data
topicRelevantMap = {}

# For each minute, pull the tweet text and search for the keywords we want
for t in sortedTimes:
    timeObj = frequencyMap[t]
    topicRelevantMap[t] = {"count": 0, "list": []}
    
    for tweetObj in timeObj["list"]:
        tweetString = tweetObj["text"].lower()

        # Add to the counter if the target keyword is in this tweet
        for keyword in targetKeywords:
            if ( keyword.lower() in tweetString ):
                topicRelevantMap[t]["list"].append(tweetObj)
                topicRelevantMap[t]["count"] += 1
                
                break

        
# Now plot the total frequency and frequency of each keyword
fig, ax = plt.subplots()
fig.set_size_inches(11, 8.5)

plt.title("Tweet Frequency")
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)

ax.semilogy(range(len(frequencyMap)), totalCount, label="Total")

ax.scatter([crisisXCoord], [100], c="r", marker="x", s=100, label="Crisis")

relYData = [topicRelevantMap[t]["count"] for t in sortedTimes]
ax.semilogy(range(len(relYData)), relYData, label="Relevant")

ax.legend()
ax.grid(b=True, which=u'major')

plt.show()

Highly Important Relevant Tweets

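Within this relevant subset, which tweets were retweeted the most, which came from the accounts with the most followers, and which popular retweets originated from verified accounts?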

In [ ]:
allTweets = [x for t in sortedTimes for x in topicRelevantMap[t]["list"]]

# Get the top retweeted tweets (materialize the filter so we can reuse it below)
onlyRetweets = list(filter(lambda x: "retweeted_status" in x, allTweets))
topTweets = sorted(onlyRetweets, key=lambda x: x["retweeted_status"]["retweet_count"], 
                   reverse=True)[:10]

print("Top Retweets:")
for x in topTweets:
    print(x["id"], x["user"]["screen_name"], x["retweeted_status"]["retweet_count"], x["text"])

# Get tweets from users with the most followers
topTweets = sorted(allTweets, key=lambda x: x["user"]["followers_count"], reverse=True)[:10]

print()
print("Top Accounts:")
for x in topTweets:
    print(x["id"], x["user"]["screen_name"], x["user"]["followers_count"], x["text"])
    
    
# Get the top retweeted tweets, but only those retweeting verified accounts
verifiedTweets = list(filter(lambda x: x["retweeted_status"]["user"]["verified"], onlyRetweets))
topTweets = sorted(verifiedTweets, key=lambda x: x["retweeted_status"]["retweet_count"], 
                   reverse=True)[:10]

print()
print("Top Retweets from Verified Accounts:")
for x in topTweets:
    print(x["id"], x["user"]["screen_name"], x["retweeted_status"]["retweet_count"], x["text"])

Quick Geo-data Comparison

An interesting comparison might be to look at the areas of concentration of relevant tweets.


In [ ]:
# A frequency map for timestamps to geo-coded tweets
relGeoFreqMap = {}
relGeoCount = 0

# Keep only those tweets with a tweet['coordinates']['coordinates'] entry
for t in sortedTimes:
    geos = list(filter(lambda tweet: tweet["coordinates"] != None and 
                       "coordinates" in tweet["coordinates"], 
                       topicRelevantMap[t]["list"]))
    relGeoCount += len(geos)
    
    # Add to the timestamp map
    relGeoFreqMap[t] = {"count": len(geos), "list": geos}

print ("Number of Relevant Geo Tweets:", relGeoCount)

# Create a list of all geo-coded tweets
tmpGeoList = [relGeoFreqMap[t]["list"] for t in sortedTimes]
relGeoTweets = functools.reduce(lambda x, y: x + y, tmpGeoList)

# For each geo-coded tweet, extract its GPS coordinates
relGeoCoord = [x["coordinates"]["coordinates"] for x in relGeoTweets]

fig, ax = plt.subplots(figsize=(24,24))
worldMap = Basemap(projection='merc', llcrnrlat=-80, urcrnrlat=80,
                   llcrnrlon=-180, urcrnrlon=180, resolution='l')

worldMap.fillcontinents(color=land_color, lake_color=water_color, zorder=1)
worldMap.drawcoastlines()
worldMap.drawparallels(np.arange(-90.,120.,30.))
worldMap.drawmeridians(np.arange(0.,420.,60.))
worldMap.drawmapboundary(fill_color=water_color, zorder=0)
worldMap.drawcountries()
ax.set_title('Global Relevant Tweets')

# Convert points from GPS coordinates to (x,y) coordinates
allConvPoints = [worldMap(p[0], p[1]) for p in geoCoord]
x = [p[0] for p in allConvPoints]
y = [p[1] for p in allConvPoints]
worldMap.scatter(x, y, s=100, marker='x', color="blue", zorder=2)

# Convert points from GPS coordinates to (x,y) coordinates
relConvPoints = [worldMap(p[0], p[1]) for p in relGeoCoord]
x = [p[0] for p in relConvPoints]
y = [p[1] for p in relConvPoints]
worldMap.scatter(x, y, s=100, marker='x', color="red", zorder=2)

plt.show()

Observation: Most topically relevant tweets are not geotagged.
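
To put a rough number on that observation, we can compare the relevant geo-coded count against the total relevant count computed earlier. A quick sketch:


In [ ]:
# Sketch: what fraction of topically relevant tweets carry GPS coordinates?
relevantCount = np.sum([topicRelevantMap[t]["count"] for t in sortedTimes])
if ( relevantCount > 0 ):
    print ("Geotagged fraction of relevant tweets:", relGeoCount / relevantCount)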

Sentiment Analysis w/ TextBlob

TextBlob is a nice Python package that provides a number of useful text processing capabilities. We will use it for sentiment analysis to calculate polarity and subjectivity for each relevant tweet.


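As a quick sanity check before processing every tweet, TextBlob's sentiment property returns a (polarity, subjectivity) pair for a single piece of text. The sentence below is purely illustrative.


In [ ]:
from textblob import TextBlob

# Illustrative sentence; .sentiment returns Sentiment(polarity=..., subjectivity=...)
print (TextBlob("rescue teams are doing amazing work").sentiment)
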
In [ ]:
from textblob import TextBlob

# Sentiment values
polarVals = []
objVals = []

# For each minute, pull the tweet text and compute its sentiment
for t in sortedTimes:
    timeObj = topicRelevantMap[t]
    
    # For calculating averages
    localPolarVals = []
    localObjVals = []
    
    for tweetObj in timeObj["list"]:
        tweetString = tweetObj["text"].lower()

        blob = TextBlob(tweetString)
        polarity = blob.sentiment.polarity
        subjectivity = blob.sentiment.subjectivity
        
        localPolarVals.append(polarity)
        localObjVals.append(subjectivity)
        
    # Add data to the polarity and subjectivity measure arrays
    if ( len(timeObj["list"]) > 10 ):
        polarVals.append(np.mean(localPolarVals))
        objVals.append(np.mean(localObjVals))
    else:
        polarVals.append(0.0)
        objVals.append(0.0)

        
# Now plot this sentiment data
fig, ax = plt.subplots()
fig.set_size_inches(11, 8.5)

plt.title("Sentiment")
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)

xData = range(len(sortedTimes))

ax.scatter([crisisXCoord], [0], c="r", marker="x", s=100, label="Crisis")

# Polarity is scaled [-1, 1], for negative and positive polarity
ax.plot(xData, polarVals, label="Polarity")

# Subjectivity is scaled [0, 1], with 0 = objective, 1 = subjective
ax.plot(xData, objVals, label="Subjectivity")

ax.legend()
ax.grid(b=True, which=u'major')

plt.show()

Sentiment Analysis with Vader

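VADER is a lexicon- and rule-based sentiment analyzer tuned for social media text. Its compound score ranges from -1 (most negative) to +1 (most positive), so it can be read on the same scale as the polarity above.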

In [ ]:
import nltk
nltk.download("vader_lexicon")
import nltk.sentiment.util
import nltk.sentiment.vader

In [ ]:
vader = nltk.sentiment.vader.SentimentIntensityAnalyzer()

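A quick check on a single, purely illustrative sentence shows the scores VADER produces.


In [ ]:
# polarity_scores returns neg, neu, pos, and compound scores for a piece of text
print (vader.polarity_scores("thoughts and prayers for everyone affected"))
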
In [ ]:
# Sentiment values
polarVals = []

# For each minute, pull the tweet text and compute its sentiment
for t in sortedTimes:
    timeObj = topicRelevantMap[t]
    
    # For calculating averages
    localPolarVals = []
    
    for tweetObj in timeObj["list"]:
        tweetString = tweetObj["text"].lower()

        polarity = vader.polarity_scores(tweetString)["compound"]
        
        localPolarVals.append(polarity)
        
    # Add data to the polarity measure array
    if ( len(timeObj["list"]) > 10 ):
        polarVals.append(np.mean(localPolarVals))
    else:
        polarVals.append(0.0)

        
# Now plot this sentiment data
fig, ax = plt.subplots()
fig.set_size_inches(11, 8.5)

plt.title("Sentiment")
plt.xticks(smallerXTicks, [sortedTimes[x] for x in smallerXTicks], rotation=90)

xData = range(len(sortedTimes))

ax.scatter([crisisXCoord], [0], c="r", marker="x", s=100, label="Crisis")

# Polarity is scaled [-1, 1], for negative and positive polarity
ax.plot(xData, polarVals, label="Polarity")

ax.legend()
ax.grid(b=True, which=u'major')

plt.ylim((-0.3, 0.55))
plt.show()

Topic 7: Topic Modeling

Along with sentiment analysis, a question often asked of social networks is "What are people talking about?" We can answer this question using tools from topic modeling and natural language processing. During crises, people respond in many ways, from sharing specific information about the event to expressing condolences or opening their homes to those in need.

To generate these topic models, we will use the Gensim package's implementation of Latent Dirichlet Allocation (LDA), which basically constructs a set of topics where each topic is described as a probability distribution over the words in our tweets. Several other methods for topic modeling exist as well.


In [ ]:
# Gotta pull in a bunch of packages for this
import gensim.models.ldamulticore
import gensim.matutils
import sklearn.cluster
import sklearn.feature_extraction 
import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.preprocessing

In [ ]:
nltk.download("stopwords")
from nltk.corpus import stopwords

We first extract all relevant tweets' text for building our models.


In [ ]:
# Get all relevant tweet text and convert to lowercase
allTweetText = [x["text"].lower() for t in sortedTimes for x in topicRelevantMap[t]["list"]]

print ("All Tweet Count:", len(allTweetText))

Now we build a list of stop words (words we don't care about) and build a feature generator (the vectorizer) that assigns integer keys to tokens and counts the number of each token.


In [ ]:
enStop = stopwords.words('english')
esStop = stopwords.words('spanish')

# Skip stop words, retweet signs, @ symbols, and URL headers
stopList = enStop + esStop + ["http", "https", "rt", "@", ":", "co"]

vectorizer = sklearn.feature_extraction.text.CountVectorizer(strip_accents='unicode', 
                                                             tokenizer=None,
                                                             token_pattern='(?u)#?\\b\\w+[\'-]?\\w+\\b',
                                                             stop_words=stopList)

# Analyzer
analyze = vectorizer.build_analyzer() 

# Create a vectorizer for all our content
vectorizer.fit(allTweetText)

# Get all the words in our text
names = vectorizer.get_feature_names()

# Create a map for vectorizer IDs to words
id2WordDict = dict(zip(range(len(vectorizer.get_feature_names())), names))

We then use the vectorizer to transform our tweet text into a feature set, which essentially is a table with rows of tweets, columns for each keyword, and each cell is the number of times that keyword appears in that tweet.

We then convert that table into a model the Gensim package can handle, apply LDA, and grab the top 10 topics, 10 words that describe that topic, and print them.


In [ ]:
# Create a corpus from the relevant tweet text
corpus = vectorizer.transform(allTweetText)
gsCorpus = gensim.matutils.Sparse2Corpus(corpus, documents_columns=False)
        
lda = gensim.models.LdaMulticore(gsCorpus, 
                                 id2word=id2WordDict,
                                 num_topics=20, 
                                 passes=2) # ++ passes for better results

ldaTopics = lda.show_topics(num_topics=10, 
                            num_words=10, 
                            formatted=False)

for (i, tokenList) in ldaTopics:
    print ("Topic %d:" % i, ' '.join([pair[0] for pair in tokenList]))

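We can also ask which topics an individual tweet mixes over. A sketch, using the first relevant tweet as an example:


In [ ]:
# Sketch: topic mixture for one (example) tweet
oneDoc = gensim.matutils.Sparse2Corpus(vectorizer.transform([allTweetText[0]]),
                                       documents_columns=False)
bow = next(iter(oneDoc))
print (allTweetText[0])
print (lda.get_document_topics(bow))
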
We can also be a little more strict and get rid of some noise by looking only at words with more than four characters. Stop words are often short, so by putting a floor on the length of a token, we can theoretically get higher-signal data.


In [ ]:
docArrays = filter(lambda x: len(x) > 4, [y for x in allTweetText for y in analyze(x)])
fd = nltk.FreqDist(docArrays)

print ("Most common from analyzer:")
for x in fd.most_common(20):
    print (x[0], x[1])

Topic 8: Network Analysis

Information flows and social networks are important considerations during crises, when people are trying to get updates on safe spaces, loved ones, places of shelter, etc. Twitter is noisy though, and a lot of the data may be irrelevant, condolences/thoughts expressed by celebrities, or otherwise uninformative. Using network analysis, we can get some idea about who the most important Twitter users were during this time, and how people split into groups online.

For this analysis, we'll use the NetworkX package to construct a social graph of how people interact. Each person in our Twitter data will be a node in our graph, and edges in the graph will represent mentions during this timeframe. Then we will explore a few simple analytical methods in network analysis, including:

  • Central accounts
  • Visualization

Graph Building

To limit the amount of data we're looking at, we'll only build the network for people who tweeted a relevant keyword and the people they mention. We build this network by iterating through all the tweets in our relevant list and extracting the "user_mentions" list from the "entities" section of each tweet object. For each mention a user makes, we add an edge from that user to the user they mentioned.


In [ ]:
import networkx as nx

# We'll use a directed graph since mentions/retweets are directional
graph = nx.DiGraph()
    
for tweet in [x for t in sortedTimes for x in topicRelevantMap[t]["list"]]:
    userName = tweet["user"]["screen_name"]
    graph.add_node(userName)

    mentionList = tweet["entities"]["user_mentions"]

    for otherUser in mentionList:
        otherUserName = otherUser["screen_name"]
        if ( graph.has_node(otherUserName) == False ):
            graph.add_node(otherUserName)
        graph.add_edge(userName, otherUserName)
        
print ("Number of Users:", len(graph.node))

Central Users

In network analysis, "centrality" is used to measure the importance of a given node, and many different types of centrality describe different kinds of importance. Examples include "closeness centrality," which measures how close a node is to all other nodes in the network, and "betweenness centrality," which measures how many shortest paths run through the given node. Nodes with high closeness centrality are important for rapidly disseminating information or spreading disease, whereas nodes with high betweenness are more important for keeping the network connected.

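As a sketch of how those two measures could be computed with NetworkX on our mention graph (betweenness is expensive, so we sample nodes to approximate it):


In [ ]:
# Sketch: closeness and (approximate) betweenness centrality; both can be slow on large graphs
closeness = nx.closeness_centrality(graph)
betweenness = nx.betweenness_centrality(graph, k=min(100, len(graph)))  # sample nodes to approximate

print ("Top Closeness:")
for u in sorted(closeness, key=closeness.get, reverse=True)[:10]:
    print ("\t", u, closeness[u])

print ("Top Betweenness (approximate):")
for u in sorted(betweenness, key=betweenness.get, reverse=True)[:10]:
    print ("\t", u, betweenness[u])
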
PageRank is another algorithm for measuring importance, proposed by Sergey Brin and Larry Page for an early version of Google's search engine. NetworkX has an implementation of PageRank that we can use to identify the most important/authoritative users on Twitter based on their connections to other users.


In [ ]:
# Now we prune for performance reasons
# remove all nodes with few edges

nodeList = [n for n,d in graph.degree_iter() if d<2]
graph.remove_nodes_from(nodeList)
print ("Number of Remaining Users:", len(graph.node))

In [ ]:
# This may take a while
pageRankList = nx.pagerank_numpy(graph)

In [ ]:
highRankNodes = sorted(pageRankList.keys(), key=pageRankList.get, reverse=True)
for x in highRankNodes[:20]:
    print (x, pageRankList[x])

In [ ]:
plt.figure(figsize=(8,8))
pos = nx.spring_layout(graph, scale=100, iterations=100, k=0.2)
nx.draw(graph, 
        pos, 
        node_color='#A0CBE2', 
        width=1, 
        with_labels=False,
        node_size=50)

hrNames = highRankNodes[:20]
hrDict = dict(zip(hrNames, hrNames))
hrValues = [pageRankList[x] for x in hrNames]

nx.draw_networkx_nodes(graph,pos,nodelist=hrNames,
                       node_size=200,
                       node_color=hrValues,
                       cmap=plt.cm.Reds_r)

nx.draw_networkx_labels(graph,
                        pos,
                        labels=hrDict,
                        font_size=36,
                        font_color="g")

plt.axis('off')
plt.show()
